{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "from keras.models import Sequential\n",
    "from keras.layers.core import Dense, Dropout, Activation\n",
    "from keras.layers.recurrent import LSTM\n",
    "from keras.optimizers import SGD\n",
    "import os\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "from keras.utils import np_utils\n",
    "from sklearn.metrics import precision_recall_fscore_support\n",
    "from keras.callbacks import TensorBoard\n",
    "import tensorflow as tf\n",
    "\n",
    "os.chdir('../Utils/')\n",
    "import featureGenerator\n",
    "from featureGenerator import *\n",
    "os.chdir('../src/')\n",
    "import orderbook_lstm\n",
    "from orderbook_lstm import OrderBookLSTM"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Generate Features and Response Vars"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# File locations, all built by prefixing data_dir.\n",
    "data_dir = '../../../../ProjectData/'\n",
    "in_path, out_path, out_path2 = (\n",
    "    data_dir + file_name\n",
    "    for file_name in ('msft-orderbook.csv',\n",
    "                      'msft-orderbook-all.csv',\n",
    "                      'msft-orderbook-final.csv')\n",
    ")\n",
    "\n",
    "# Selects the 'Regression' or 'Classification' branch in the cells below.\n",
    "response_type = \"Regression\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['msft-20170417-orderbook.csv', 'msft-20170418-orderbook.csv', 'msft-20170419-orderbook.csv', 'msft-20170420-orderbook.csv', 'msft-20170421-orderbook.csv', 'msft-20170424-orderbook.csv', 'msft-20170425-orderbook.csv', 'msft-20170426-orderbook.csv', 'msft-20170427-orderbook.csv', 'msft-20170428-orderbook.csv']\n",
      "../../../../ProjectData/msft-20170417-orderbook.csv\n",
      "../../../../ProjectData/msft-20170418-orderbook.csv\n",
      "../../../../ProjectData/msft-20170419-orderbook.csv\n",
      "../../../../ProjectData/msft-20170420-orderbook.csv\n",
      "../../../../ProjectData/msft-20170421-orderbook.csv\n",
      "../../../../ProjectData/msft-20170424-orderbook.csv\n",
      "../../../../ProjectData/msft-20170425-orderbook.csv\n",
      "../../../../ProjectData/msft-20170426-orderbook.csv\n",
      "../../../../ProjectData/msft-20170427-orderbook.csv\n",
      "../../../../ProjectData/msft-20170428-orderbook.csv\n"
     ]
    }
   ],
   "source": [
    "# NOTE(review): presumably merges the per-day 'msft' order book files under data_dir into out_path\n",
    "# (inferred from the arguments and the printed file list) - confirm in featureGenerator.\n",
    "mergeOrderBookDays(data_dir, out_path, ['msft'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/adam/Desktop/Stanford/MSE 448/448Project/448Project/LSTM/Utils/featureGenerator.py:79: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
      "  data['Response'] = response_col\n"
     ]
    }
   ],
   "source": [
    "# Run the feature generator on out_path (second argument: out_path2), then read out_path2\n",
    "# twice to get two independent DataFrames.\n",
    "feature_options = dict(response_type=response_type, look_forward=1)\n",
    "createFeatures(out_path, out_path2, **feature_options)\n",
    "data, data_orig = (pd.read_csv(out_path2) for _ in range(2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>direct.vwap</th>\n",
       "      <th>meanBid</th>\n",
       "      <th>meanAsk</th>\n",
       "      <th>meanBidNum</th>\n",
       "      <th>meanAskNum</th>\n",
       "      <th>meanBidVol</th>\n",
       "      <th>meanAskVol</th>\n",
       "      <th>accumulatedPriceDiff</th>\n",
       "      <th>accumulatedVolumeDiff</th>\n",
       "      <th>spread_1</th>\n",
       "      <th>spread_2</th>\n",
       "      <th>spread_3</th>\n",
       "      <th>spread_4</th>\n",
       "      <th>spread_5</th>\n",
       "      <th>spread_6</th>\n",
       "      <th>spread_7</th>\n",
       "      <th>spread_8</th>\n",
       "      <th>spread_9</th>\n",
       "      <th>spread_10</th>\n",
       "      <th>midPrice_1</th>\n",
       "      <th>midPrice_2</th>\n",
       "      <th>midPrice_3</th>\n",
       "      <th>midPrice_4</th>\n",
       "      <th>midPrice_5</th>\n",
       "      <th>midPrice_6</th>\n",
       "      <th>midPrice_7</th>\n",
       "      <th>midPrice_8</th>\n",
       "      <th>midPrice_9</th>\n",
       "      <th>midPrice_10</th>\n",
       "      <th>Response</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>65.100361</td>\n",
       "      <td>64.924</td>\n",
       "      <td>65.288</td>\n",
       "      <td>2.7</td>\n",
       "      <td>1.7</td>\n",
       "      <td>2921.6</td>\n",
       "      <td>997.0</td>\n",
       "      <td>3.64</td>\n",
       "      <td>-19246</td>\n",
       "      <td>0.14</td>\n",
       "      <td>0.19</td>\n",
       "      <td>0.24</td>\n",
       "      <td>0.26</td>\n",
       "      <td>0.35</td>\n",
       "      <td>0.41</td>\n",
       "      <td>0.46</td>\n",
       "      <td>0.48</td>\n",
       "      <td>0.54</td>\n",
       "      <td>0.57</td>\n",
       "      <td>65.100</td>\n",
       "      <td>65.095</td>\n",
       "      <td>65.080</td>\n",
       "      <td>65.080</td>\n",
       "      <td>65.095</td>\n",
       "      <td>65.105</td>\n",
       "      <td>65.120</td>\n",
       "      <td>65.120</td>\n",
       "      <td>65.130</td>\n",
       "      <td>65.135</td>\n",
       "      <td>65.081990</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>65.081990</td>\n",
       "      <td>64.970</td>\n",
       "      <td>65.208</td>\n",
       "      <td>6.2</td>\n",
       "      <td>2.8</td>\n",
       "      <td>2439.2</td>\n",
       "      <td>1263.6</td>\n",
       "      <td>2.38</td>\n",
       "      <td>-11756</td>\n",
       "      <td>0.08</td>\n",
       "      <td>0.13</td>\n",
       "      <td>0.15</td>\n",
       "      <td>0.18</td>\n",
       "      <td>0.22</td>\n",
       "      <td>0.24</td>\n",
       "      <td>0.27</td>\n",
       "      <td>0.31</td>\n",
       "      <td>0.35</td>\n",
       "      <td>0.45</td>\n",
       "      <td>65.060</td>\n",
       "      <td>65.075</td>\n",
       "      <td>65.075</td>\n",
       "      <td>65.080</td>\n",
       "      <td>65.090</td>\n",
       "      <td>65.090</td>\n",
       "      <td>65.095</td>\n",
       "      <td>65.105</td>\n",
       "      <td>65.095</td>\n",
       "      <td>65.125</td>\n",
       "      <td>65.097505</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>65.097505</td>\n",
       "      <td>65.044</td>\n",
       "      <td>65.185</td>\n",
       "      <td>5.6</td>\n",
       "      <td>2.8</td>\n",
       "      <td>4393.5</td>\n",
       "      <td>790.1</td>\n",
       "      <td>1.41</td>\n",
       "      <td>-36034</td>\n",
       "      <td>0.04</td>\n",
       "      <td>0.06</td>\n",
       "      <td>0.08</td>\n",
       "      <td>0.11</td>\n",
       "      <td>0.13</td>\n",
       "      <td>0.15</td>\n",
       "      <td>0.18</td>\n",
       "      <td>0.20</td>\n",
       "      <td>0.22</td>\n",
       "      <td>0.24</td>\n",
       "      <td>65.120</td>\n",
       "      <td>65.120</td>\n",
       "      <td>65.120</td>\n",
       "      <td>65.115</td>\n",
       "      <td>65.115</td>\n",
       "      <td>65.115</td>\n",
       "      <td>65.110</td>\n",
       "      <td>65.110</td>\n",
       "      <td>65.110</td>\n",
       "      <td>65.110</td>\n",
       "      <td>65.099488</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>65.099488</td>\n",
       "      <td>65.048</td>\n",
       "      <td>65.185</td>\n",
       "      <td>5.4</td>\n",
       "      <td>2.6</td>\n",
       "      <td>4300.8</td>\n",
       "      <td>319.6</td>\n",
       "      <td>1.37</td>\n",
       "      <td>-39812</td>\n",
       "      <td>0.04</td>\n",
       "      <td>0.06</td>\n",
       "      <td>0.08</td>\n",
       "      <td>0.11</td>\n",
       "      <td>0.13</td>\n",
       "      <td>0.15</td>\n",
       "      <td>0.17</td>\n",
       "      <td>0.19</td>\n",
       "      <td>0.21</td>\n",
       "      <td>0.23</td>\n",
       "      <td>65.120</td>\n",
       "      <td>65.120</td>\n",
       "      <td>65.120</td>\n",
       "      <td>65.115</td>\n",
       "      <td>65.115</td>\n",
       "      <td>65.115</td>\n",
       "      <td>65.115</td>\n",
       "      <td>65.115</td>\n",
       "      <td>65.115</td>\n",
       "      <td>65.115</td>\n",
       "      <td>65.101385</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>65.101385</td>\n",
       "      <td>65.070</td>\n",
       "      <td>65.175</td>\n",
       "      <td>2.7</td>\n",
       "      <td>3.0</td>\n",
       "      <td>2587.2</td>\n",
       "      <td>482.9</td>\n",
       "      <td>1.05</td>\n",
       "      <td>-21043</td>\n",
       "      <td>0.01</td>\n",
       "      <td>0.03</td>\n",
       "      <td>0.05</td>\n",
       "      <td>0.07</td>\n",
       "      <td>0.09</td>\n",
       "      <td>0.12</td>\n",
       "      <td>0.14</td>\n",
       "      <td>0.16</td>\n",
       "      <td>0.18</td>\n",
       "      <td>0.20</td>\n",
       "      <td>65.125</td>\n",
       "      <td>65.125</td>\n",
       "      <td>65.125</td>\n",
       "      <td>65.125</td>\n",
       "      <td>65.125</td>\n",
       "      <td>65.120</td>\n",
       "      <td>65.120</td>\n",
       "      <td>65.120</td>\n",
       "      <td>65.120</td>\n",
       "      <td>65.120</td>\n",
       "      <td>65.101807</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   direct.vwap  meanBid  meanAsk  meanBidNum  meanAskNum  meanBidVol  \\\n",
       "0    65.100361   64.924   65.288         2.7         1.7      2921.6   \n",
       "1    65.081990   64.970   65.208         6.2         2.8      2439.2   \n",
       "2    65.097505   65.044   65.185         5.6         2.8      4393.5   \n",
       "3    65.099488   65.048   65.185         5.4         2.6      4300.8   \n",
       "4    65.101385   65.070   65.175         2.7         3.0      2587.2   \n",
       "\n",
       "   meanAskVol  accumulatedPriceDiff  accumulatedVolumeDiff  spread_1  \\\n",
       "0       997.0                  3.64                 -19246      0.14   \n",
       "1      1263.6                  2.38                 -11756      0.08   \n",
       "2       790.1                  1.41                 -36034      0.04   \n",
       "3       319.6                  1.37                 -39812      0.04   \n",
       "4       482.9                  1.05                 -21043      0.01   \n",
       "\n",
       "   spread_2  spread_3  spread_4  spread_5  spread_6  spread_7  spread_8  \\\n",
       "0      0.19      0.24      0.26      0.35      0.41      0.46      0.48   \n",
       "1      0.13      0.15      0.18      0.22      0.24      0.27      0.31   \n",
       "2      0.06      0.08      0.11      0.13      0.15      0.18      0.20   \n",
       "3      0.06      0.08      0.11      0.13      0.15      0.17      0.19   \n",
       "4      0.03      0.05      0.07      0.09      0.12      0.14      0.16   \n",
       "\n",
       "   spread_9  spread_10  midPrice_1  midPrice_2  midPrice_3  midPrice_4  \\\n",
       "0      0.54       0.57      65.100      65.095      65.080      65.080   \n",
       "1      0.35       0.45      65.060      65.075      65.075      65.080   \n",
       "2      0.22       0.24      65.120      65.120      65.120      65.115   \n",
       "3      0.21       0.23      65.120      65.120      65.120      65.115   \n",
       "4      0.18       0.20      65.125      65.125      65.125      65.125   \n",
       "\n",
       "   midPrice_5  midPrice_6  midPrice_7  midPrice_8  midPrice_9  midPrice_10  \\\n",
       "0      65.095      65.105      65.120      65.120      65.130       65.135   \n",
       "1      65.090      65.090      65.095      65.105      65.095       65.125   \n",
       "2      65.115      65.115      65.110      65.110      65.110       65.110   \n",
       "3      65.115      65.115      65.115      65.115      65.115       65.115   \n",
       "4      65.125      65.120      65.120      65.120      65.120       65.120   \n",
       "\n",
       "    Response  \n",
       "0  65.081990  \n",
       "1  65.097505  \n",
       "2  65.099488  \n",
       "3  65.101385  \n",
       "4  65.101807  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Allow up to 500 columns to be rendered when a frame is displayed.\n",
    "pd.options.display.max_columns = 500\n",
    "\n",
    "data.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data = pd.read_csv(out_path2)\n",
    "\n",
    "# Fraction of rows (in file order) used to fit the scaler.\n",
    "# Keep in sync with the 0.67 used by the train/test split cell.\n",
    "train_fraction = 0.67\n",
    "n_fit_rows = int(len(data) * train_fraction)\n",
    "\n",
    "# normalize the dataset\n",
    "# Every column except 'Response' is scaled; the target keeps its original units.\n",
    "# The scaler is fitted on the training rows only, so the minima/maxima of the held-out rows\n",
    "# do not leak into the features. Held-out values may therefore fall slightly outside [0, 1].\n",
    "scaler = MinMaxScaler(feature_range=(0, 1))\n",
    "cols_to_normalize = [col for col in data.columns if col != 'Response']\n",
    "scaler.fit(data[cols_to_normalize].iloc[:n_fit_rows])\n",
    "data[cols_to_normalize] = scaler.transform(data[cols_to_normalize])\n",
    "\n",
    "dataset = data.values.astype('float32')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# Correlation heatmap of every column of `data`.\n",
    "f, ax = plt.subplots(figsize=(20, 20))\n",
    "corr = data.corr()\n",
    "# `np.bool` was removed in NumPy 1.24; the builtin `bool` is the equivalent dtype.\n",
    "# The mask is all False, so no cell of the matrix is hidden.\n",
    "sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool), cmap=sns.diverging_palette(220, 10, as_cmap=True),\n",
    "            square=True, ax=ax)\n",
    "ax.set_title('Column correlation matrix')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Plots\n",
    "# Re-binds data_orig to the merged file (out_path); the PnL cells further down read\n",
    "# 'direct.ask1' / 'direct.bid1' from this frame.\n",
    "data_orig = pd.read_csv(out_path)\n",
    "\n",
    "# One line per column direct.bsize1 ... direct.bsize5, plotted against the row number.\n",
    "row_index = np.arange(len(data_orig))\n",
    "for level in range(1, 6):\n",
    "    column = 'direct.bsize%d' % level\n",
    "    plt.plot(row_index, data_orig[column], label=column)\n",
    "plt.xlabel('row')\n",
    "plt.ylabel('size')\n",
    "plt.legend();"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Display any figure that is still pending from the plotting cell above.\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Train/Test Split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "156779 77220\n"
     ]
    }
   ],
   "source": [
    "# Split in row order: the first 67% of the rows form the training set, the rest the test set.\n",
    "train_size = int(len(dataset) * 0.67)\n",
    "test_size = len(dataset) - train_size\n",
    "train = dataset[:train_size, :]\n",
    "test = dataset[train_size:, :]\n",
    "print(len(train), len(test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# convert an array of values into a dataset matrix\n",
    "def create_dataset(dataset, look_back, response_type):\n",
    "    '''Slice a 2-D array into overlapping windows and matching targets.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    dataset : np.ndarray of shape (n_rows, n_columns)\n",
    "        The last column is read as the response. Every column (response included)\n",
    "        is kept inside the windows.\n",
    "    look_back : int\n",
    "        Number of consecutive rows per window.\n",
    "    response_type : str\n",
    "        'Classification' or 'Regression'.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    (X, Y) : tuple of np.ndarray\n",
    "        X holds n_rows - look_back - 1 windows; window k covers rows k .. k + look_back - 1.\n",
    "        Regression: Y[k] is the response of row k + look_back, the first row after window k.\n",
    "        Classification: Y[k] is the one-hot (3 classes) response of row k + look_back + 1.\n",
    "        Code that looks up raw rows for sample k must therefore offset by look_back.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If response_type is neither 'Classification' nor 'Regression'.\n",
    "    '''\n",
    "    if response_type not in ('Classification', 'Regression'):\n",
    "        # An unknown type used to fall through and return None, which only failed later\n",
    "        # when the caller unpacked the result.\n",
    "        raise ValueError('Unknown response_type: %r' % (response_type,))\n",
    "\n",
    "    response_col = dataset.shape[1] - 1\n",
    "    n_windows = len(dataset) - look_back - 1\n",
    "    dataX = [dataset[i:(i + look_back), :] for i in range(n_windows)]\n",
    "\n",
    "    if response_type == 'Classification':\n",
    "        # NOTE(review): the label is taken one row later than in the regression branch;\n",
    "        # confirm that this extra offset is intended.\n",
    "        dataY = get_one_hot(dataset[look_back+1:, response_col].astype(int).reshape(-1), 3)\n",
    "    else:\n",
    "        # The target must lie outside the window. The windows contain the response column,\n",
    "        # so taking row i (the first row of window i) handed the model its own answer.\n",
    "        dataY = [dataset[i + look_back, response_col] for i in range(n_windows)]\n",
    "\n",
    "    return np.array(dataX), np.array(dataY)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Convert response variable to one-hot vectors\n",
    "def get_one_hot(targets, nb_classes):\n",
    "    '''Return, for each entry of targets, the matching row of the nb_classes identity matrix.'''\n",
    "    class_ids = np.asarray(targets).reshape(-1)\n",
    "    identity = np.eye(nb_classes)\n",
    "    return identity[class_ids]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Number of rows per window, shared by the training and the test sequences.\n",
    "look_back = 30\n",
    "(trainX, trainY), (testX, testY) = (\n",
    "    create_dataset(split, look_back, response_type=response_type)\n",
    "    for split in (train, test)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(30, 30)"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Shape of one training sample: (look_back rows, number of dataset columns).\n",
    "trainX[1].shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Train Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Building regression model...\n",
      "Compiling model...\n"
     ]
    }
   ],
   "source": [
    "from importlib import reload\n",
    "os.chdir('../src/')\n",
    "import orderbook_lstm\n",
    "# Reload BEFORE binding the class: a name imported ahead of reload() keeps pointing at the\n",
    "# old class object, so edits to orderbook_lstm.py would not be picked up.\n",
    "reload(orderbook_lstm)\n",
    "from orderbook_lstm import OrderBookLSTM\n",
    "\n",
    "from keras import backend as K\n",
    "# Reset Keras/TensorFlow state left over from models built earlier in this kernel.\n",
    "K.clear_session()\n",
    "\n",
    "n_features = 69   # NOTE(review): not used below; the input width is taken from trainX.shape[2]\n",
    "n_neurons = 100\n",
    "n_classes = 3\n",
    "n_hidden = 1      # NOTE(review): not used below; the model is built with num_hidden_layers = 0\n",
    "dropout = None    # NOTE(review): not passed to the constructor\n",
    "\n",
    "lstm = OrderBookLSTM(timesteps = look_back,\n",
    "                     layer_neurons = n_neurons,\n",
    "                     input_shape = (look_back, trainX.shape[2]),\n",
    "                     output_shape = n_classes, # doesn't matter if regression\n",
    "                     num_hidden_layers = 0,\n",
    "                     response_type = response_type)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Fetch the model object from the wrapper; summary(), fit() and predict() are called on it below.\n",
    "# NOTE(review): presumably a compiled Keras model - confirm in orderbook_lstm.\n",
    "mod = lstm.get_model()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "_________________________________________________________________\n",
      "Layer (type)                 Output Shape              Param #   \n",
      "=================================================================\n",
      "lstm_1 (LSTM)                (None, 30, 100)           52400     \n",
      "_________________________________________________________________\n",
      "flatten_1 (Flatten)          (None, 3000)              0         \n",
      "_________________________________________________________________\n",
      "dense_1 (Dense)              (None, 1)                 3001      \n",
      "=================================================================\n",
      "Total params: 55,401\n",
      "Trainable params: 55,401\n",
      "Non-trainable params: 0\n",
      "_________________________________________________________________\n"
     ]
    }
   ],
   "source": [
    "# Print the layers with their output shapes and parameter counts.\n",
    "mod.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 156748 samples, validate on 77189 samples\n",
      "Epoch 1/5\n",
      "156748/156748 [==============================] - 68s 433us/step - loss: 133.1387 - val_loss: 3.8153\n",
      "Epoch 2/5\n",
      "156748/156748 [==============================] - 69s 442us/step - loss: 0.4119 - val_loss: 0.0306\n",
      "Epoch 3/5\n",
      "156748/156748 [==============================] - 70s 444us/step - loss: 0.0088 - val_loss: 0.0232\n",
      "Epoch 4/5\n",
      "156748/156748 [==============================] - 70s 449us/step - loss: 0.0079 - val_loss: 0.0201\n",
      "Epoch 5/5\n",
      "156748/156748 [==============================] - 72s 459us/step - loss: 0.0062 - val_loss: 0.0259\n"
     ]
    }
   ],
   "source": [
    "# Class weights to change\n",
    "class_weight = {0 : 1.,\n",
    "    1: 1.,\n",
    "    2: 1.}\n",
    "\n",
    "# Arguments shared by both response types.\n",
    "# validation_data is passed as the (x_val, y_val) tuple that Keras documents.\n",
    "# Note that the held-out test split doubles as the validation set here.\n",
    "fit_kwargs = dict(validation_data=(testX, testY),\n",
    "                  epochs=5,\n",
    "                  verbose=1,\n",
    "                  callbacks=[TensorBoard(log_dir='Logs/', write_graph=True)])\n",
    "\n",
    "if response_type == 'Classification':\n",
    "    mod.fit(trainX, trainY, batch_size=128, class_weight=class_weight, **fit_kwargs)\n",
    "elif response_type == 'Regression':\n",
    "    mod.fit(trainX, trainY, batch_size=256, **fit_kwargs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Frequency of each distinct response value. The response is the last column of `dataset`;\n",
    "# the former hard-coded index 68 is out of range for the 30-column array built above.\n",
    "unique, counts = np.unique(dataset[:, -1], return_counts=True)\n",
    "counts"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Make Predictions and get Metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Training Error\n",
    "if response_type == 'Classification':\n",
    "    preds_training = mod.predict(trainX).argmax(axis=-1)\n",
    "    # Expressions inside an `if` block are not echoed by the notebook, so print them explicitly.\n",
    "    print(pd.Series(preds_training).value_counts())\n",
    "    print(precision_recall_fscore_support(np.argmax(trainY, axis=1), preds_training))\n",
    "else:\n",
    "    preds_training = mod.predict(trainX)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Validation Error\n",
    "if response_type == 'Classification':\n",
    "    # Run inference once and derive the hard labels from the class probabilities.\n",
    "    preds_probs = mod.predict(testX)\n",
    "    preds = preds_probs.argmax(axis=-1)\n",
    "    # Expressions inside an `if` block are not echoed by the notebook, so print them explicitly.\n",
    "    print(pd.Series(preds).value_counts())\n",
    "    print(precision_recall_fscore_support(np.argmax(testY, axis=1), preds))\n",
    "else:\n",
    "    preds_test = mod.predict(testX)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.metrics import confusion_matrix\n",
    "import seaborn as sn\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# A confusion matrix needs class predictions; `preds` is only created in classification mode.\n",
    "if response_type == 'Classification':\n",
    "    # True class ids recovered from the one-hot targets (`class_labels` was never defined before).\n",
    "    class_labels = np.argmax(testY, axis=1)\n",
    "    # Fixing the label set keeps the matrix 3x3 even when a class is absent from the test data.\n",
    "    cm = confusion_matrix(class_labels, preds, labels=[0, 1, 2])\n",
    "    df_cm = pd.DataFrame(cm, range(3),\n",
    "                      range(3))\n",
    "    sn.set(font_scale=1.4)#for label size\n",
    "    sn.heatmap(df_cm, annot=True,annot_kws={\"size\": 10})# font size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Display any figure that is still pending from the heatmap cell above.\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from collections import Counter\n",
    "# Count how often each distinct value occurs in `preds`.\n",
    "l = list(preds)\n",
    "Counter(l)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Number of entries in `preds`.\n",
    "len(preds)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Calculate PnL (+1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "starting_money = 10000\n",
    "pnl_lstm = 0\n",
    "buffer = 0.05       # minimum gap between prediction and reference price required to trade\n",
    "trade_size = 1000   # units per trade, identical for buys and sells\n",
    "pnl_history_lstm = []\n",
    "pnl_history_by_trade = []\n",
    "\n",
    "# Predictions are indexed relative to the test split, so prices must be read from the same rows\n",
    "# (previously rows 0..n of the full frame were used).\n",
    "# NOTE(review): assumes data_orig is the merged book ('direct.ask1' / 'direct.bid1' present)\n",
    "# and is row-aligned with `dataset` - confirm.\n",
    "test_data = data_orig.loc[train_size:len(data_orig),:].reset_index(drop = True)\n",
    "\n",
    "# Strategy:\n",
    "#   time t   : pred > current + buffer -> buy at the ask\n",
    "#              pred < current - buffer -> sell at the bid\n",
    "#   time t+1 : close the trade again.\n",
    "# NOTE(review): the original notes say a bought position is sold at the bid at t+1, while the\n",
    "# code values it at the next ask. The code's price choice is kept - confirm which is intended.\n",
    "# NOTE(review): sample i of testX spans test rows i .. i+look_back-1; confirm that row i is the\n",
    "# intended decision time.\n",
    "if response_type == 'Regression':\n",
    "    for i in range(len(testX)):\n",
    "        pred = preds_test[i][0]\n",
    "        # NOTE(review): testY[i] is the realised target of sample i, which is not known at\n",
    "        # decision time; confirm which observable price should be the reference.\n",
    "        current = testY[i]\n",
    "\n",
    "        if pred > (current + buffer):\n",
    "            # Buy at the ask, close one row later (valued at the next ask).\n",
    "            buy_price = test_data.loc[i, 'direct.ask1']\n",
    "            ask_next = test_data.loc[i+1, 'direct.ask1']\n",
    "            trade_pnl = (ask_next - buy_price) * trade_size\n",
    "        elif pred < (current - buffer):\n",
    "            # Sell at the bid, buy back one row later (valued at the next bid).\n",
    "            # A sale gains when the price falls, hence sell price minus buy-back price\n",
    "            # (the sign was inverted before).\n",
    "            sell_price = test_data.loc[i, 'direct.bid1']\n",
    "            bid_next = test_data.loc[i+1, 'direct.bid1']\n",
    "            trade_pnl = (sell_price - bid_next) * trade_size\n",
    "        else:\n",
    "            continue\n",
    "\n",
    "        pnl_lstm += trade_pnl\n",
    "        pnl_history_by_trade.append(trade_pnl)\n",
    "        pnl_history_lstm.append(pnl_lstm)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# VWAP Crossing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "starting_money = 10000\n",
    "pnl_lstm = 0\n",
    "buffer = 0.05      # minimum gap between prediction and quote required to trade\n",
    "trade_size = 100   # units per trade, identical for buys and sells\n",
    "pnl_history_lstm = []\n",
    "pnl_history_by_trade = []\n",
    "\n",
    "inventory = 1000\n",
    "inventory_history = [inventory]\n",
    "\n",
    "test_data = data_orig.loc[train_size:len(data_orig),:].reset_index(drop = True)\n",
    "\n",
    "# Only the first 50 test samples are simulated (this cap used to silently override\n",
    "# n = len(testX)); replace 50 by len(testX) for a full run.\n",
    "n = min(50, len(testX))\n",
    "\n",
    "# Strategy:\n",
    "#   time t   : pred > current_ask + buffer -> buy at the ask\n",
    "#              pred < current_bid - buffer -> sell at the bid\n",
    "#   time t+1 : the trade is valued against the next row's quote.\n",
    "# NOTE(review): sample i of testX spans test rows i .. i+look_back-1; confirm that row i is the\n",
    "# intended decision time.\n",
    "if response_type == 'Regression':\n",
    "    for i in range(n):\n",
    "        pred = preds_test[i][0]\n",
    "        current_ask = test_data.loc[i, 'direct.ask1']\n",
    "        current_bid = test_data.loc[i, 'direct.bid1']\n",
    "\n",
    "        if pred > (current_ask + buffer):\n",
    "            # Buy at the ask; valued one row later at the next ask. The next quote is read\n",
    "            # from test_data as well (it used to come from the unsliced data_orig).\n",
    "            inventory += trade_size\n",
    "            ask_next = test_data.loc[i+1, 'direct.ask1']\n",
    "            trade_pnl = (ask_next - current_ask) * trade_size\n",
    "        elif pred < (current_bid - buffer):\n",
    "            # Sell at the bid; valued one row later at the next bid. A sale gains when the\n",
    "            # price falls, hence current minus next (the sign was inverted before).\n",
    "            inventory -= trade_size\n",
    "            bid_next = test_data.loc[i+1, 'direct.bid1']\n",
    "            trade_pnl = (current_bid - bid_next) * trade_size\n",
    "        else:\n",
    "            continue\n",
    "\n",
    "        inventory_history.append(inventory)\n",
    "        pnl_lstm += trade_pnl\n",
    "        pnl_history_by_trade.append(trade_pnl)\n",
    "        pnl_history_lstm.append(pnl_lstm)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# Running PnL total, one point per executed trade.\n",
    "plt.plot(pnl_history_lstm)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Inventory level after each executed trade, starting from the initial 1000.\n",
    "inventory_history"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# PnL (Classification)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# One-step backtest of the classifier signal.\n",
    "#\n",
    "# At time t, for every prediction whose probability is at least prob_limit:\n",
    "#   - class 1 -> buy at the current best ask\n",
    "#   - class 2 -> sell at the current best bid\n",
    "#   - any other class -> no trade\n",
    "# At time t+1 the position is closed against the same side of the book\n",
    "# (next ask for a long, next bid for a short).\n",
    "# NOTE(review): closing on the same side means the bid/ask spread is never\n",
    "# charged -- confirm this is the intended PnL definition.\n",
    "starting_money = 100000   # not read anywhere in this cell\n",
    "stop_loss = -50000        # not read anywhere in this cell\n",
    "pnl_lstm = 0              # running PnL total\n",
    "pnl_history_lstm = []     # running total after each executed trade\n",
    "pnl_history_by_trade = [] # PnL of each individual trade\n",
    "prob_limit = 0.8          # minimum predicted-class probability required to trade\n",
    "inventory = [1000]        # not read anywhere in this cell\n",
    "\n",
    "test_data = data_orig.loc[train_size:len(data_orig),:].reset_index(drop = True)\n",
    "\n",
    "if response_type == 'Classification':\n",
    "    for i in range(len(testX)):\n",
    "        # The exit is priced off row i+1; stop once that row no longer exists\n",
    "        # instead of failing with a KeyError on the final step.\n",
    "        if i + 1 >= len(test_data):\n",
    "            break\n",
    "\n",
    "        pred = preds[i]\n",
    "        prob = preds_probs[i][pred]\n",
    "        if prob < prob_limit:\n",
    "            continue\n",
    "\n",
    "        # Position size scales with the probability of the predicted class.\n",
    "        amount = 100*prob\n",
    "\n",
    "        if pred == 1:\n",
    "            # Long: buy at the ask now, exit against the next ask.\n",
    "            entry_price = test_data.loc[i, 'direct.ask1']\n",
    "            exit_price = test_data.loc[i+1, 'direct.ask1']\n",
    "            trade_pnl = (exit_price - entry_price) * amount\n",
    "        elif pred == 2:\n",
    "            # Short: sell at the bid now, cover against the next bid.\n",
    "            # A short position gains when the price falls, so PnL is entry minus exit.\n",
    "            entry_price = test_data.loc[i, 'direct.bid1']\n",
    "            exit_price = test_data.loc[i+1, 'direct.bid1']\n",
    "            trade_pnl = (entry_price - exit_price) * amount\n",
    "        else:\n",
    "            continue\n",
    "\n",
    "        pnl_lstm += trade_pnl\n",
    "        pnl_history_by_trade.append(trade_pnl)\n",
    "        pnl_history_lstm.append(pnl_lstm)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# pnl_history_lstm holds the running PnL total, appended once per executed trade,\n",
    "# so the x axis is the trade count rather than the time step.\n",
    "fig, ax = plt.subplots()\n",
    "ax.plot(pnl_history_lstm)\n",
    "ax.set_title('Cumulative LSTM strategy PnL (classification)')\n",
    "ax.set_xlabel('Executed trade #')\n",
    "ax.set_ylabel('Cumulative PnL')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Number of trades that closed with a strictly positive PnL.\n",
    "sum(1 for trade_pnl in pnl_history_by_trade if trade_pnl > 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Number of trades that closed with a strictly negative PnL.\n",
    "sum(1 for trade_pnl in pnl_history_by_trade if trade_pnl < 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Naive strategy: predict the mid price/VWAP/AREA for the next time period. If we predict that this metric will go up, we buy. If we predict that it will go down, we sell. We close all positions after each time period. Calculate PnL for that period as ____"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Ignore"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Per-class loss weights passed to fit(): classes 1 and 2 are weighted\n",
    "# 15x and 18x relative to class 0. These values are meant to be tuned.\n",
    "class_weight = {0: 1.,\n",
    "                1: 15.,\n",
    "                2: 18.}\n",
    "\n",
    "# Single LSTM layer (10 units) feeding a 3-way softmax output.\n",
    "# The (timesteps, features) input shape is read from trainX rather than\n",
    "# hard-coded, so the cell keeps working if the look-back window or the\n",
    "# feature set changes. Assumes trainX is a 3-D numpy array -- TODO confirm.\n",
    "model = Sequential()\n",
    "model.add(LSTM(10, input_shape=trainX.shape[1:], return_sequences=False))\n",
    "model.add(Dense(3, activation = 'softmax'))\n",
    "model.compile(loss='categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])\n",
    "\n",
    "# Training progress is also written to Logs/testlog for TensorBoard.\n",
    "model.fit(trainX, trainY,\n",
    "          epochs=2,\n",
    "          batch_size=10,\n",
    "          class_weight = class_weight,\n",
    "          callbacks=[TensorBoard(log_dir='Logs/testlog', write_graph=True)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Predicted class for each test sample = index of the largest model output.\n",
    "test_outputs = model.predict(testX)\n",
    "preds = np.argmax(test_outputs, axis=-1)\n",
    "\n",
    "# How often each class is predicted.\n",
    "pd.Series(preds).value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Argmax along axis 1 turns each row of testY into an integer class label,\n",
    "# which is then scored against the predicted classes.\n",
    "true_labels = np.argmax(testY, axis=1)\n",
    "precision_recall_fscore_support(true_labels, preds)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
